library(aplore3)
library(ggplot2)
library(plotly)
library(RColorBrewer)
library(pheatmap)
library(cluster)
library(dplyr)
library(tidyr)

Data Format: A data.frame with 500 rows and 18 variables such as:

priorfrac - If the patient previously had a fracture
age
weight
height
bmi
premeno
momfrac
armassist
smoke
raterisk
fracscore
fracture
bonemed - Bone medications at enrollment (1: No, 2: Yes)
bonemed_fu - Bone medications at follow-up (1: No, 2: Yes)
bonetreat - Bone medications both at enrollment and follow-up (1: No, 2: Yes)

head(glow_bonemed)
##   sub_id site_id phy_id priorfrac age weight height      bmi premeno momfrac
## 1      1       1     14        No  62   70.3    158 28.16055      No      No
## 2      2       4    284        No  65   87.1    160 34.02344      No      No
## 3      3       6    305       Yes  88   50.8    157 20.60936      No     Yes
## 4      4       6    309        No  82   62.1    160 24.25781      No      No
## 5      5       1     37        No  61   68.0    152 29.43213      No      No
## 6      6       5    299       Yes  67   68.0    161 26.23356      No      No
##   armassist smoke raterisk fracscore fracture bonemed bonemed_fu bonetreat
## 1        No    No     Same         1       No      No         No        No
## 2        No    No     Same         2       No      No         No        No
## 3       Yes    No     Less        11       No      No         No        No
## 4        No    No     Less         5       No      No         No        No
## 5        No    No     Same         1       No      No         No        No
## 6        No   Yes     Same         4       No      No         No        No
mysummary = glow_bonemed %>%
  select(age, weight, height, bmi, fracscore) %>% # select variables to summarise
  summarise_each(funs(min = min, 
                      q25 = quantile(., 0.25), 
                      median = median, 
                      q75 = quantile(., 0.75), 
                      max = max,
                      mean = mean, 
                      sd = sd,
                      variance= var))

# reshape it using tidyr functions

clean.summary = mysummary %>% gather(stat, val) %>%
  separate(stat, into = c("var", "stat"), sep = "_") %>%
  spread(stat, val) %>%
  select(var, min, max, mean, sd,variance) # reorder columns
print(clean.summary)
##         var       min       max      mean        sd   variance
## 1       age  55.00000  90.00000  68.56200  8.989537  80.811780
## 2       bmi  14.87637  49.08241  27.55303  5.973958  35.688178
## 3 fracscore   0.00000  11.00000   3.69800  2.495446   6.227251
## 4    height 134.00000 199.00000 161.36400  6.355493  40.392289
## 5    weight  39.90000 127.00000  71.82320 16.435992 270.141825
summary(glow_bonemed %>% select(priorfrac, premeno, momfrac, armassist, smoke, raterisk, fracture, bonemed, bonemed_fu, bonetreat))
##  priorfrac premeno   momfrac   armassist smoke        raterisk   fracture 
##  No :374   No :403   No :435   No :312   No :465   Less   :167   No :375  
##  Yes:126   Yes: 97   Yes: 65   Yes:188   Yes: 35   Same   :186   Yes:125  
##                                                    Greater:147            
##  bonemed   bonemed_fu bonetreat
##  No :371   No :361    No :382  
##  Yes:129   Yes:139    Yes:118  
## 

Age vs Weight: As weight increases the average age decreases
Age vs Height: Weak correlation of as height increases age decreases
Age vs BMI: As bmi increases the average age decreases
Age vs fracscore: As age increases the average fracscore increases

Weight vs Height: As height increases the average weight increases
Weight vs BMI: As bmi increases the average weight increases
Weight vs fracscore: As fracscore increases the average Weight decreases

Height vs BMI: As bmi increases the average height and variance stay the same
Height vs fracscore: As fracscore increases the average height stays the same though variance might decrease

BMI vs fracscore: As fracscore increases the average bmi decreases

plot(glow_bonemed[, c(5:8, 14)])

Non of the following scatter plots show strong groupings for the fracture/no fracture categorical variable

ggplot(glow_bonemed, aes(x = age, y = bmi, color = fracture)) +
  geom_jitter()+
  labs(title = "BMI vs Age")

ggplot(glow_bonemed, aes(x = bmi, y = fracscore, color = fracture)) +
  geom_jitter()+
  labs(title = "Fracture Score vs BMI")

ggplot(glow_bonemed, aes(x = fracscore, y = age, color = fracture)) +
  geom_jitter()+
  labs(title = "Age vs Fracture Score")

ggplot(glow_bonemed, aes(x = weight, y = height, color = fracture)) +
  geom_jitter()+
  labs(title = "Height vs Weight")

Once again there doesn’t seem to be strong groupings of the fracture categorical variable

fracture3dplot = plot_ly(glow_bonemed, 
  x = ~age, 
  y = ~height, 
  z = ~bmi, 
  color = ~fracture, 
  colors = c('#0C4B8E', '#BF382A')) %>% add_markers()

fracture3dplot

The boxplot shows that the mean fracscore seems to be slightly higher for smokers compared to non smokers

ggplot(glow_bonemed, aes(x = smoke, y = fracscore)) +
  geom_boxplot()+
  labs(title = "Fracture Score Summary Statistics for Smokers vs Non Smokers")

Clustering EDA

pheatmap(glow_bonemed[, c(5,8)], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))

pheatmap(glow_bonemed[, 5:8], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))

zScoreScale = scale(glow_bonemed[, 5:8])

zScoreDistance = dist(zScoreScale)

continuousVariableClustering = hclust(zScoreDistance, method = "complete")

plot(continuousVariableClustering)

Sources:
Hosmer, D.W., Lemeshow, S. and Sturdivant, R.X. (2013) Applied Logistic Regression, 3rd ed., New York: Wiley

https://cran.r-project.org/web/packages/aplore3/aplore3.pdf#page=11&zoom=100,132,90